The objective is to explore the FIFA 19 dataset included in the
DALEX package (see the original
dataset).
# Read the saved workspace file; it is expected to provide `fifa19small`.
load("./data/fifa19small.rda")

# Build the analysis table:
#   * the 9th column is renamed to `Reputation` (selected by position),
#   * any existing row names are cleared first, because
#     column_to_rownames() refuses data that already carries row names,
#   * the `Name` column then becomes the row index.
fifa <- fifa19small |>
  rename(Reputation = 9) |>
  remove_rownames() |>
  column_to_rownames("Name")

# Show the prepared table.
fifa
First look at the dataset (skim)
# Summarise every column of `fifa` (types, missingness, quantiles, mini-histograms).
fifa |>
  skimr::skim()
| Name | fifa |
| Number of rows | 16924 |
| Number of columns | 44 |
| _______________________ | |
| Column type frequency: | |
| factor | 3 |
| numeric | 41 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| Club | 0 | 1 | FALSE | 651 | Ars: 33, Bor: 33, Che: 33, Fro: 33 |
| Position | 0 | 1 | FALSE | 27 | ST: 1980, GK: 1876, CB: 1638, CM: 1296 |
| Preferred.Foot | 0 | 1 | FALSE | 2 | Rig: 12999, Lef: 3925 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Value.EUR | 0 | 1 | 2534477.66 | 5769632.37 | 0 | 325000 | 725000 | 2200000 | 118500000 | ▇▁▁▁▁ |
| Age | 0 | 1 | 25.16 | 4.67 | 16 | 21 | 25 | 28 | 44 | ▅▇▅▁▁ |
| Overall | 0 | 1 | 66.40 | 6.95 | 46 | 62 | 66 | 71 | 94 | ▁▇▇▂▁ |
| Special | 0 | 1 | 1602.17 | 273.85 | 731 | 1462 | 1641 | 1792 | 2346 | ▁▂▇▇▁ |
| Reputation | 0 | 1 | 1.12 | 0.40 | 1 | 1 | 1 | 1 | 5 | ▇▁▁▁▁ |
| Weak.Foot | 0 | 1 | 2.95 | 0.66 | 1 | 3 | 3 | 3 | 5 | ▁▂▇▂▁ |
| Skill.Moves | 0 | 1 | 2.37 | 0.76 | 1 | 2 | 2 | 3 | 5 | ▂▇▆▁▁ |
| Crossing | 0 | 1 | 49.95 | 18.40 | 5 | 38 | 54 | 64 | 93 | ▂▃▆▇▁ |
| Finishing | 0 | 1 | 45.72 | 19.56 | 2 | 30 | 49 | 62 | 95 | ▃▆▇▇▁ |
| HeadingAccuracy | 0 | 1 | 52.41 | 17.42 | 4 | 45 | 56 | 65 | 94 | ▂▂▇▇▁ |
| ShortPassing | 0 | 1 | 58.91 | 14.71 | 7 | 54 | 62 | 68 | 93 | ▁▁▃▇▁ |
| Volleys | 0 | 1 | 43.11 | 17.77 | 4 | 30 | 44 | 57 | 90 | ▃▆▇▆▁ |
| Dribbling | 0 | 1 | 55.58 | 18.94 | 4 | 49 | 61 | 68 | 97 | ▂▁▅▇▁ |
| Curve | 0 | 1 | 47.42 | 18.46 | 6 | 34 | 49 | 62 | 94 | ▃▆▇▇▁ |
| FKAccuracy | 0 | 1 | 43.08 | 17.56 | 4 | 31 | 42 | 57 | 94 | ▃▇▇▅▁ |
| LongPassing | 0 | 1 | 52.94 | 15.32 | 9 | 43 | 56 | 64 | 93 | ▂▃▇▇▁ |
| BallControl | 0 | 1 | 58.60 | 16.71 | 5 | 54 | 63 | 69 | 96 | ▁▁▃▇▁ |
| Acceleration | 0 | 1 | 64.63 | 14.99 | 12 | 57 | 67 | 75 | 97 | ▁▂▃▇▂ |
| SprintSpeed | 0 | 1 | 64.76 | 14.70 | 12 | 57 | 67 | 75 | 96 | ▁▂▃▇▂ |
| Agility | 0 | 1 | 63.59 | 14.80 | 14 | 56 | 66 | 74 | 96 | ▁▂▅▇▂ |
| Reactions | 0 | 1 | 62.01 | 9.03 | 21 | 56 | 62 | 68 | 96 | ▁▂▇▅▁ |
| Balance | 0 | 1 | 63.93 | 14.19 | 16 | 56 | 66 | 74 | 96 | ▁▂▆▇▂ |
| ShotPower | 0 | 1 | 55.67 | 17.26 | 2 | 46 | 59 | 68 | 95 | ▁▂▅▇▁ |
| Jumping | 0 | 1 | 65.15 | 11.86 | 15 | 58 | 66 | 73 | 95 | ▁▁▆▇▂ |
| Stamina | 0 | 1 | 63.31 | 15.92 | 12 | 56 | 67 | 74 | 96 | ▁▂▃▇▂ |
| Strength | 0 | 1 | 65.47 | 12.51 | 17 | 58 | 67 | 74 | 97 | ▁▂▆▇▂ |
| LongShots | 0 | 1 | 47.33 | 19.31 | 3 | 33 | 52 | 63 | 94 | ▃▅▇▇▁ |
| Aggression | 0 | 1 | 56.07 | 17.36 | 11 | 44 | 59 | 69 | 95 | ▂▃▆▇▂ |
| Interceptions | 0 | 1 | 46.88 | 20.70 | 3 | 26 | 52 | 64 | 92 | ▃▅▅▇▁ |
| Positioning | 0 | 1 | 50.16 | 19.57 | 2 | 39 | 55 | 65 | 95 | ▂▂▆▇▁ |
| Vision | 0 | 1 | 53.66 | 14.15 | 10 | 44 | 55 | 64 | 94 | ▁▃▇▆▁ |
| Penalties | 0 | 1 | 48.64 | 15.72 | 5 | 39 | 50 | 60 | 92 | ▂▃▇▆▁ |
| Composure | 0 | 1 | 58.86 | 11.44 | 12 | 51 | 60 | 67 | 96 | ▁▂▇▆▁ |
| Marking | 0 | 1 | 47.38 | 19.91 | 3 | 30 | 53 | 64 | 94 | ▃▅▅▇▁ |
| StandingTackle | 0 | 1 | 47.85 | 21.67 | 2 | 27 | 55 | 66 | 93 | ▃▃▃▇▁ |
| SlidingTackle | 0 | 1 | 45.78 | 21.30 | 3 | 24 | 52 | 64 | 91 | ▅▃▃▇▁ |
| GKDiving | 0 | 1 | 16.60 | 17.68 | 1 | 8 | 11 | 14 | 90 | ▇▁▁▁▁ |
| GKHandling | 0 | 1 | 16.37 | 16.90 | 1 | 8 | 11 | 14 | 92 | ▇▁▁▁▁ |
| GKKicking | 0 | 1 | 16.21 | 16.49 | 1 | 8 | 11 | 14 | 91 | ▇▁▁▁▁ |
| GKPositioning | 0 | 1 | 16.37 | 17.02 | 1 | 8 | 11 | 14 | 90 | ▇▁▁▁▁ |
| GKReflexes | 0 | 1 | 16.69 | 17.94 | 1 | 8 | 11 | 14 | 94 | ▇▁▁▁▁ |
Number of observations (rows): 16,924
Number of variables (columns): 44 (index = player’s name)
Number of numeric/continuous variables: 41
Number of categorical variables: 3 (Club,
Position, Preferred.Foot)
Number of missing values: 0
Target variable (response): Value.EUR
Histogram of player’s value (Value.EUR)
# Distribution of the raw player value, using ggplot2's default binning.
ggplot(fifa, aes(x = Value.EUR)) +
  geom_histogram(fill = "steelblue")
Histogram of log-transformed player’s value
(Value.EUR)
# Distribution of the player value on a log10 scale. The 0.1 offset keeps
# players whose value is 0 finite under the logarithm.
ggplot(fifa, aes(x = log10(Value.EUR + 0.1))) +
  geom_histogram(fill = "steelblue")
Histograms for selected variables (figure 21.2)
# Histograms of four selected characteristics, laid out on a 2 x 2 grid.
plot_histogram(
  select(fifa, Age, BallControl, Dribbling, Reactions),
  nrow = 2L,
  ncol = 2L
)
The low values of BallControl and Dribbling
are due to goalkeepers.

Scatterplots for selected variables (figure 21.3)
# Reshape the selected characteristics into long format: one row per
# (player, characteristic) pair, with Value.EUR kept alongside as the response.
# pivot_longer() replaces the superseded gather(); the resulting columns
# (Value.EUR, variable, value) are the same, only the row order differs,
# which does not affect the plot below.
fifa19small4long <- fifa19small |>
  select(Value.EUR, Age, BallControl, Dribbling, Reactions) |>
  pivot_longer(
    cols = -Value.EUR,
    names_to = "variable",
    values_to = "value"
  )

# One panel per characteristic: player value (log10 y-axis) against the
# characteristic, with a smoothed trend drawn on top of the raw points.
# Each panel gets its own axis ranges (scales = "free").
fifa19small4long |>
  ggplot(aes(value, Value.EUR)) +
  geom_point() +
  geom_smooth(linewidth = 2, se = FALSE, color = "steelblue") +
  theme_drwhy() +
  facet_wrap(~variable, ncol = 2, scales = "free") +
  scale_y_continuous(
    "Value in Euro",
    trans = "log10",
    labels = dollar_format(suffix = "€", prefix = "")
  ) +
  scale_x_continuous("") +
  ggtitle("Scatterplots for players' characteristics", "") +
  theme_ema()
Scatterplot matrix for selected variables
library(GGally)
# Pairwise view of four characteristics: bar-style histograms on the
# diagonal and loess-smoothed scatterplots in the lower triangle.
ggpairs(
  select(fifa19small, Age, BallControl, Dribbling, Reactions),
  diag = list(continuous = "barDiag"),
  lower = list(
    continuous = wrap("smooth", method = "loess", colour = "steelblue")
  )
) +
  theme_drwhy() +
  ggtitle("Scatterplot matrix for players' characteristics", "") +
  theme_ema()
Correlation funnel
library(correlationfunnel)
# Binarize the features: numeric columns are cut into 5 bins and every
# bin/level becomes its own 0/1 column; levels rarer than 1% are pooled
# into "Other". Club and Overall are dropped before binarization.
# The native pipe is used throughout, matching the rest of the file.
binarized_tbl <- fifa |>
  select(-Club, -Overall) |>
  binarize(
    n_bins = 5,
    thresh_infreq = 0.01,
    name_infreq = "Other",
    one_hot = TRUE
  )

# Correlate every binarized feature with the target column.
# NOTE(review): `Value.EUR__3100000_Inf` looks like the top Value.EUR bin
# generated by binarize() above; the name presumably changes if the data or
# n_bins change -- confirm before reusing with other settings.
corr_tbl <- binarized_tbl |>
  correlate(Value.EUR__3100000_Inf)

# Draw the correlation funnel as an interactive plot.
corr_tbl |>
  plot_correlation_funnel(interactive = TRUE)